This notebook accompanies the manuscript titled "Parent Reflections on ABA: Analysis of 5,450 Open-Ended Responses," submitted to Behavior Analysis in Practice. The notebook is structured as follows:
# System stuff
import warnings
warnings.filterwarnings('ignore')
# Data manipulation
import pandas as pd
import numpy as np
import string
import re
from collections import Counter
import random
import collections
# Data analysis
from scipy import stats
from scipy.stats import skew
from scipy.stats import skewtest
# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.colors as mcolors
palette='gist_earth_r'
# NLP Packages
from sklearn.impute import KNNImputer
try:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
try:
import spacy
except:
!pip install spacy
import spacy
from spacy.lang.en import English
parser = English()
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.util import ngrams
from nltk.corpus import stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
try:
import gensim
except:
!pip install gensim
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
try:
import pyLDAvis
except:
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models
import pickle
# Set directory to Google Drive
from google.colab import drive
drive.mount('/content/gdrive')
%cd './gdrive/My Drive/Patient-Satisfaction-NLP/Scripts and Notebooks/'
def bar_plot(x_col, y_col, data, hue=None, dodge=True,
             x_label=None, x_rot=0, x_ticks=None, x_tick_lab=None,
             y_label=None, y_rot=0, y_ticks=None, y_tick_lab=None, y_min=0, y_max=None,
             title=None, save_name=None, palette=palette, legend=False, figsize=(10, 7),
             ypad=125):
    """Draw a labeled seaborn barplot and optionally save it to ../figures/.

    Parameters
    ----------
    x_col, y_col : str or array-like
        Columns (or data) plotted on the x and y axes.
    data : pandas.DataFrame
        Source data passed to seaborn.
    hue : str, optional
        Column used to split bars by color.
    dodge : bool
        Whether hue levels are drawn side by side.
    x_label, y_label, title : str, optional
        Axis and figure labels.
    x_rot, y_rot : int
        Tick/label rotation in degrees.
    x_ticks, x_tick_lab, y_ticks, y_tick_lab : list, optional
        Explicit tick positions and labels.
    y_min, y_max : numeric
        y-axis limits.
    save_name : str, optional
        Basename for the PNG written to ../figures/; nothing is saved
        when omitted.
    palette : str
        seaborn palette name (defaults to the module-level ``palette``).
    legend : bool
        Show a legend when True.
    figsize : tuple
        Figure size in inches.
    ypad : int
        Padding between the y label and the axis.

    Returns
    -------
    None
    """
    plt.figure(figsize=figsize)
    ax = sns.barplot(x=x_col, y=y_col, hue=hue, dodge=dodge,
                     data=data, palette=palette)
    plt.xlabel(x_label, fontsize=30, labelpad=24)
    plt.xticks(ticks=x_ticks, labels=x_tick_lab, fontsize=22, rotation=x_rot)
    plt.ylabel(y_label, fontsize=30, labelpad=ypad, rotation=y_rot)
    plt.yticks(ticks=y_ticks, labels=y_tick_lab, fontsize=22)
    plt.ylim(y_min, y_max)
    plt.title(label=title, fontsize=30, pad=40)
    # Hide the top/right spines for a cleaner look
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)
    if legend:
        plt.legend(fontsize=16, loc="best")
    # Bug fix: the original always called savefig, writing
    # '../figures/None.png' when save_name was left at its default.
    if save_name is not None:
        plt.savefig(f"../figures/{save_name}.png", bbox_inches='tight')
    plt.show()
def barplot(data, x, y, figsize, xlabel, title, ylabel, rotation=False):
    """Draw a plain black seaborn barplot with large fonts.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data.
    x, y : str
        Column names for the x and y axes.
    figsize : tuple
        Figure size in inches.
    xlabel : str
        Accepted for API compatibility but currently unused — the x label
        is always drawn empty.  NOTE(review): probably intended to be passed
        to plt.xlabel; confirm before changing, as it would alter figures.
    title, ylabel : str
        Figure title and y-axis label.
    rotation : bool
        Rotate the x tick labels 45 degrees when True.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.barplot(x=x, y=y, data=data, color='black')
    plt.xlabel('', fontsize=30, labelpad=16)
    plt.yticks(fontsize=50)
    if rotation:
        plt.xticks(fontsize=50, rotation=45)
    else:
        plt.xticks(fontsize=50)
    plt.ylabel(ylabel, fontsize=35, labelpad=16)
    plt.title(title, fontsize=70, pad=40)
    # Hide the top/right spines for a cleaner look
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    plt.tight_layout()
    plt.show()
def histplot(x_col, data, bins='auto', x_label=None, x_rot=0,
             x_ticks=None, x_tick_lab=None, xmin=1, xmax=10,
             y_label=None, y_rot=0, y_ticks=None,
             y_tick_lab=None, y_min=0, y_max=None,
             title=None, save_name=None, cum=False,
             stat='probability', figsize=(10, 7)):
    """Draw a labeled seaborn histogram and optionally save it to ../figures/.

    Parameters
    ----------
    x_col : str
        Column of `data` to histogram.
    data : pandas.DataFrame
        Source data.
    bins : str or int
        Binning rule forwarded to seaborn.
    x_label, y_label, title : str, optional
        Axis and figure labels.
    x_rot, y_rot : int
        Tick/label rotation in degrees.
    x_ticks, x_tick_lab, y_ticks, y_tick_lab : list, optional
        Explicit tick positions and labels.
    xmin, xmax, y_min, y_max : numeric
        Axis limits.
    save_name : str, optional
        Basename for the PNG written to ../figures/; nothing is saved
        when omitted.
    cum : bool
        Draw a cumulative histogram when True.
    stat : str
        Aggregate statistic ('probability', 'percent', 'count', ...).
    figsize : tuple
        Figure size in inches.

    Returns
    -------
    None
    """
    plt.figure(figsize=figsize)
    ax = sns.histplot(data=data, x=x_col, bins=bins, cumulative=cum, stat=stat)
    plt.xlabel(x_label, fontsize=30, labelpad=24)
    plt.xticks(ticks=x_ticks, labels=x_tick_lab, fontsize=22, rotation=x_rot)
    plt.xlim(xmin, xmax)
    plt.ylabel(y_label, fontsize=30, labelpad=125, rotation=y_rot)
    plt.yticks(ticks=y_ticks, labels=y_tick_lab, fontsize=22)
    plt.ylim(y_min, y_max)
    plt.title(label=title, fontsize=30, pad=40)
    # Hide the top/right spines for a cleaner look
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)
    # Bug fix: only save when a name was supplied (the original wrote
    # '../figures/None.png' for the default save_name).
    if save_name is not None:
        plt.savefig(f"../figures/{save_name}.png", bbox_inches='tight')
    plt.show()
def line_plot(df, col, save_name):
    """Plot the metric `col` against the 'topics' column and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'topics' column plus the metric column `col`.
    col : str
        Name of the metric column plotted on the y axis.
    save_name : str
        Basename of the PNG written to ../figures/.

    Returns
    -------
    None
    """
    plt.figure(figsize=(10, 7))
    sns.lineplot(x='topics', y=col, data=df,
                 marker='o', size=15, color='k', legend=False)
    plt.xlabel("Topics", fontsize=24, labelpad=20)
    plt.xticks(fontsize=18)
    plt.ylabel(col, fontsize=24, labelpad=20)
    plt.yticks(fontsize=18)
    plt.savefig(f'../figures/{save_name}.png', bbox_inches='tight')
    plt.show()
def remove_more_punct(text):
    """Lower-case `text`, strip ASCII punctuation and assorted typographic
    symbols, and collapse runs of whitespace to single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Cleaned, lower-cased text.
    """
    import re
    import string
    lowered = text.lower()
    # Drop every ASCII punctuation character in a single pass.
    no_ascii_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Remove typographic symbols (smart quotes, currency/legal signs, etc.).
    cleaned = re.sub(r"""[()\’°""#/@;¢€:£<“>{}«®`©”+=~‘|.!?,]""", "", no_ascii_punct)
    # NOTE(review): the leading '/' makes this match only a slash followed by
    # a non-letter; slashes were already removed above, so this is a no-op.
    cleaned = re.sub(r'/[^a-zA-Z]', "", cleaned)
    return ' '.join(cleaned.split())
def ngram_analysis(df, ngram_col, num_n=2, num_of_terms_to_display=15, title='Bigrams', figsize=(20, 15)):
    """Tabulate and plot the most frequent n-grams in `df[ngram_col]`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a column of n-gram lists.
    ngram_col : str
        Column whose cells are lists of n-gram strings.
    num_n : int
        Unused; kept for backward compatibility with existing callers.
        The n-gram size is determined by the pre-computed column passed in.
    num_of_terms_to_display : int
        Number of top n-grams printed and plotted.
    title : str
        Title for the bar plots.
    figsize : tuple
        Figure size forwarded to barplot().

    Returns
    -------
    None
    """
    # Bug fix: the local variable was previously named `ngrams`, shadowing
    # nltk.util.ngrams imported at module level.
    gram_lists = df[ngram_col].tolist()
    ng_count = collections.Counter([x for sublist in gram_lists for x in sublist])
    ngram_df = pd.DataFrame.from_dict(ng_count, orient='index').sort_values(by=[0], ascending=False).reset_index(drop=False)
    ngram_df.columns = ['index', 'count']
    # NOTE(review): this is the count as a share of *unique* n-grams, not of
    # total occurrences — confirm that denominator is intended.
    ngram_df['percent'] = 100*round(ngram_df['count']/len(ngram_df), 4)
    print('\n\nLength: ', len(ngram_df))
    # Bug fix: previously hard-coded to 15 regardless of the parameter.
    print(ngram_df.iloc[:num_of_terms_to_display, :])
    ngram_df['ngram'] = ngram_df['index'].apply(lambda x: remove_more_punct(str(x)))
    barplot(data=ngram_df[:num_of_terms_to_display], x='count', y='ngram', figsize=figsize, xlabel='Count',
            title=title, ylabel='')
    barplot(data=ngram_df[:num_of_terms_to_display], x='percent', y='ngram', figsize=figsize, xlabel='Count',
            title=title, ylabel='', rotation=True)
def lda_model_grid(df, col):
    """Fit LDA models with 2 through 10 topics and collect fit statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the tokenized-document column.
    col : str
        Column whose cells are lists of tokens (one document per row).

    Returns
    -------
    pandas.DataFrame
        One row per topic count, with columns 'topics', 'complexity'
        (log perplexity) and 'coherence' (c_v coherence).
    """
    # Bug fix: the original appended a *generator object* for every document
    # (`df_text.append(tok for tok in tokens)`) in addition to the token list
    # itself, so the dictionary effectively saw each document twice.
    df_text = df[col].tolist()
    df_dict = corpora.Dictionary(df_text)
    df_corpus = [df_dict.doc2bow(text) for text in df[col]]
    # Fit one model per candidate topic count and record its statistics
    topics = []
    complexity = []
    coherence = []
    for i in range(2, 11):
        print(f"Checking model with {i} topics.")
        warnings.filterwarnings('ignore')
        # Build model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=df_corpus,
                                                    id2word=df_dict,
                                                    num_topics=i,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=1000,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        # Compute complexity (log perplexity) and coherence score
        cmplx = lda_model.log_perplexity(df_corpus)
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=df[col],
                                             dictionary=df_dict,
                                             coherence='c_v')
        coh = coherence_model_lda.get_coherence()
        topics.append(i)
        complexity.append(cmplx)
        coherence.append(coh)
    # Collect the grid results into a dataframe
    return pd.DataFrame({'topics': topics,
                         'complexity': complexity,
                         'coherence': coherence})
def lda_one_topic(df, col, topics, savename, grid_cols, fig_height=5):
    """Fit a single gensim LDA model and produce several visualizations.

    Fits a `topics`-topic LDA model on the token lists in `df[col]`, saves
    the model and its top-20 words per topic to ../data/07_model_output/,
    then renders: a FacetGrid of top words per topic, paired word-count /
    topic-weight bar charts, a Bokeh t-SNE scatter of document-topic
    weights, and a pyLDAvis HTML report saved to ../figures/.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the tokenized-document column.
    col : str
        Column whose cells are lists of tokens (one document per row).
    topics : int
        Number of LDA topics to fit.
    savename : str
        Basename used for the saved model, CSV, and HTML outputs.
    grid_cols : int
        Number of columns in the bar-chart subplot grid.
    fig_height : numeric
        Per-row height (inches) of the bar-chart grid.

    Returns
    -------
    pandas.DataFrame
        Top-20 words per topic, as returned by lda_model.print_topics.
    """
    df_text = []
    for i in range(len(df)):
        tokens = df[col][i]
        # NOTE(review): this appends a *generator object* and then the token
        # list itself, so the dictionary effectively sees each document
        # twice — looks unintended; confirm before changing, since fixing it
        # alters the dictionary's token ids and the fitted model.
        df_text.append(tok for tok in tokens)
        df_text.append(tokens)
    df_dict = corpora.Dictionary(df_text)
    df_corpus = [df_dict.doc2bow(text) for text in df[col]]
    # Build model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=df_corpus,
                                                id2word=df_dict,
                                                num_topics=topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=1000,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    # Persist the model and its top-20 words per topic
    lda_model.save(f'../data/07_model_output/{savename}.gensim')
    all_topics = lda_model.print_topics(num_words=20)
    all_topics = pd.DataFrame(all_topics)
    all_topics.to_csv(f'../data/07_model_output/topic_words_{savename}.csv')
    # Visualize via facet grid
    n_words = 10
    topic_words = pd.DataFrame({})
    for i, topic in enumerate(lda_model.get_topics()):
        # Indices of the n_words highest-weight terms, in descending order
        top_feature_ids = topic.argsort()[-n_words:][::-1]
        feature_values = topic[top_feature_ids]
        words = [df_dict[id] for id in top_feature_ids]
        topic_df = pd.DataFrame({'value': feature_values, 'word': words, 'topic': i})
        topic_words = pd.concat([topic_words, topic_df], ignore_index=True)
    g = sns.FacetGrid(topic_words, col="topic", col_wrap=grid_cols*2,
                      sharey=False, height=5, aspect=0.65)
    g.map(plt.barh, "word", "value")
    plt.ylabel("")
    plt.show()
    # Visualize via barplot
    topics_ = lda_model.show_topics(formatted=False)
    data_flat = [w for w_list in df[col] for w in w_list]
    counter = Counter(data_flat)
    out = []
    for i, topic in topics_:
        for word, weight in topic:
            out.append([word, i , weight, counter[word]])
    temp_df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
    # Plot Word Count and Weights of Topic Keywords
    fig, axes = plt.subplots(round((topics+1)/2), grid_cols, figsize=(grid_cols*8, round((topics+1)/2)*fig_height), sharey=False, dpi=160)
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    for i, ax in enumerate(axes.flatten()):
        # Word counts (wide, translucent bars) and topic weights (narrow
        # bars on a twin axis) share the same x positions
        ax.bar(x='word', height="word_count", data=temp_df.loc[temp_df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
        ax_twin = ax.twinx()
        ax_twin.bar(x='word', height="importance", data=temp_df.loc[temp_df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
        ax.set_ylabel('Frequency', color=cols[i], fontsize=20)
        # Headroom above the tallest bar on each axis
        wc_height = temp_df['word_count'].max() + int(0.1*(temp_df['word_count'].max()))
        we_height = temp_df['importance'].max() + 0.01
        ax_twin.set_ylim(0, we_height); ax.set_ylim(0,wc_height)
        ax.set_title('Topic: ' + str(i+1), color=cols[i], fontsize=30)
        ax.tick_params(axis='y', left=False)
        ax.set_xticklabels(temp_df.loc[temp_df.topic_id==i, 'word'], rotation=45, horizontalalignment= 'right', fontsize=20)
        ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
    fig.tight_layout(w_pad=2)
    plt.show()
    # Get topic weights and dominant topics ------------
    from sklearn.manifold import TSNE
    from bokeh.plotting import figure, output_file, show
    from bokeh.models import Label
    from bokeh.io import output_notebook
    # Get topic weights
    topic_weights = []
    for i, row_list in enumerate(lda_model[df_corpus]):
        # NOTE: the comprehension variable `i` shadows the loop index here
        # (harmless, since the index is not used afterwards)
        topic_weights.append([w for i, w in row_list[0]])
    # Array of topic weights
    arr = pd.DataFrame(topic_weights).fillna(0).values
    # Keep the well separated points (optional)
    arr = arr[np.amax(arr, axis=1) > 0.35]
    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)
    # tSNE Dimension Reduction
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)
    # Plot the Topic Clusters using Bokeh
    output_notebook()
    n_topics = topics
    mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
    plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
                  plot_width=900, plot_height=700)
    plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
    show(plot)
    # Visualize using pyLDAvis
    pyLDAvis.enable_notebook()
    lda_display = pyLDAvis.gensim_models.prepare(lda_model,
                                                 df_corpus,
                                                 df_dict,
                                                 sort_topics=False)
    pyLDAvis.display(lda_display)
    pyLDAvis.save_html(lda_display, f'../figures/lda_vis_{savename}.html')
    return all_topics
*** WARNING: This notebook is not automatically attached to a data source. Running these cells will raise errors and remove the saved visuals.
# Read in the raw data
all_data = pd.read_csv('../data/01_raw/all_raw_text.csv')
# Drop the index column written by a previous to_csv
all_data = all_data.drop(['Unnamed: 0'], axis=1)
all_data[::910] # spot-check: view every 910th row
# Score every open-text response with the VADER sentiment model
san = SentimentIntensityAnalyzer()
score_dicts = [san.polarity_scores(text) for text in all_data['feedback']]
# Attach the four VADER component scores as columns
all_data['Positive Sentiment Score'] = [d.get('pos') for d in score_dicts]
all_data['Neutral Sentiment Score'] = [d.get('neu') for d in score_dicts]
all_data['Negative Sentiment Score'] = [d.get('neg') for d in score_dicts]
all_data['Overall Sentiment Score'] = [d.get('compound') for d in score_dicts]
# Take a look
all_data[::910]
# Describe
all_data.describe()
# Proportion of responses counted as positive and negative
print("Proportion Positive: ", round((len(all_data[all_data['Overall Sentiment Score']>=0.50])/len(all_data)), 2))
print("Proportion Negative: ", round((len(all_data[all_data['Overall Sentiment Score']<=-0.50])/len(all_data)), 2))
# NOTE(review): a score of exactly 0.50 is counted both as positive above and
# as neutral below, and scores in (-0.50, 0) fall into no bucket — confirm
# the intended boundaries before reporting these proportions.
temp_df = all_data[all_data['Overall Sentiment Score']>=0]
temp_df = temp_df[temp_df['Overall Sentiment Score']<=0.5]
print("Proportion Neutral: ", round((len(temp_df)/len(all_data)), 2))
# (Removed a stray '26/41' expression whose value was computed and discarded.)
# Print statistical tests of skewness for the sentiment distributions
for i in ['Positive Sentiment Score', 'Negative Sentiment Score',
          'Neutral Sentiment Score', 'Overall Sentiment Score']:
    temp_df = all_data[i]
    # skew() is the sample skewness; skewtest() tests H0: skewness == 0
    print(i, skew(temp_df), '\n', skewtest(temp_df), '\n')
# Histograms of each
# Quick plots of Vader sentiment before and after
fig, axs = plt.subplots(2, 2, figsize=(15, 10), sharey=False, sharex=False)
sns.histplot(x=all_data['Positive Sentiment Score'], ax=axs[0, 0], stat='percent')
sns.histplot(x=all_data['Neutral Sentiment Score'], ax=axs[0, 1], stat='percent')
sns.histplot(x=all_data['Negative Sentiment Score'], ax=axs[1, 0], stat='percent')
sns.histplot(x=all_data['Overall Sentiment Score'], ax=axs[1, 1], stat='percent')
# Per-axis labels are blanked; titles are placed manually with fig.text
# Upper left plot
axs[0, 0].set_ylabel('')
axs[0, 0].set_xlabel('')
fig.text(0.25, 0.85, 'Positive Sentiment Scores')
# Upper right plot
axs[0, 1].set_ylabel('')
axs[0, 1].set_xlabel('')
fig.text(0.6, 0.85, 'Neutral Sentiment Scores')
# Lower left plot
axs[1, 0].set_ylabel('')
axs[1, 1].set_ylabel('')
fig.text(0.25, 0.425, 'Negative Sentiment Scores')
# Lower right plot
axs[1, 0].set_xlabel('')
axs[1, 1].set_xlabel('')
fig.text(.6, 0.425, 'Overall Sentiment Scores')
# Shared axis labels for the 2x2 grid
fig.text(0.35, 0.0, 'Sentiment Score', fontsize=40)
fig.text(0.04, 0.15, 'Percentage of Respondents', fontsize=40, rotation='vertical')
# Save it
plt.savefig(fname='../figures/hist_vader.png', bbox_inches='tight')
# Show it
plt.show()
# Violin plots of each sentiment distribution
fig, axs = plt.subplots(2, 2, figsize=(15, 10), sharey=False, sharex=False)
# Bug fix: seaborn.violinplot has no `stat` parameter (that keyword belongs
# to histplot); passing stat='percentage' raises a TypeError on current
# seaborn releases, so the invalid keyword is dropped here.
sns.violinplot(x=all_data['Positive Sentiment Score'], ax=axs[0, 0])
sns.violinplot(x=all_data['Neutral Sentiment Score'], ax=axs[0, 1])
sns.violinplot(x=all_data['Negative Sentiment Score'], ax=axs[1, 0])
sns.violinplot(x=all_data['Overall Sentiment Score'], ax=axs[1, 1])
# Upper left plot
axs[0, 0].set_ylabel('')
axs[0, 0].set_xlabel('')
fig.text(0.25, 0.85, 'Positive Sentiment Scores')
# Upper right plot
axs[0, 1].set_ylabel('')
axs[0, 1].set_xlabel('')
fig.text(0.6, 0.85, 'Neutral Sentiment Scores')
# Lower left plot
axs[1, 0].set_ylabel('')
axs[1, 1].set_ylabel('')
fig.text(0.25, 0.425, 'Negative Sentiment Scores')
# Lower right plot
axs[1, 0].set_xlabel('')
axs[1, 1].set_xlabel('')
fig.text(.6, 0.425, 'Overall Sentiment Scores')
# Shared axis labels for the 2x2 grid
fig.text(0.35, 0.0, 'Sentiment Score', fontsize=40)
fig.text(0.04, 0.15, 'Percentage of Respondents', fontsize=40, rotation='vertical')
# Save it
plt.savefig(fname='../figures/violin_vader.png', bbox_inches='tight')
# Show it
plt.show()
# Make sure everything in the col is showing as a string
all_data['feedback'] = all_data['feedback'].astype(str)
# Lower case all text into a new working column
all_data['feedback_clean'] = all_data['feedback'].str.lower()
all_data[::910]
# Remove stopwords (plus names/terms specific to this dataset)
stop_words = stopwords.words('english') + ['albert', 'centria', 'lauren', 'julien', 'sonnenberg', 'kailee', 'ahss', 'moments', 'bca', 'nov 06 2019', 'evan'] # Add some troublesome words to remove
all_data['feedback_clean'] = all_data['feedback_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
all_data[::910]
# Handle variations present in our text the nltk algorithms don't catch out-of-the-box
punct = set(string.punctuation)
# Ordered (pattern, replacement) pairs; order matters — e.g. 'sons' must be
# collapsed to 'son' before 'son' is mapped to 'child'.
replacement_pairs = [
    ('team.', 'team'),
    ('son\'s', 'son'),
    ('family.', 'family'),
    ('getting', 'get'),
    ('receiving', 'receive'),
    ('training', 'train'),
    ('there.', 'there'),
    ('program.', 'program'),
    ('months.', 'month'),
    ('provider.', 'provider'),
    ('son.', 'son'),
    ('sons', 'son'),
    ('therapists.', 'therapist'),
    ('worked', 'work'),
    ('therapists', 'therapist'),
    ('son', 'child'),
    ('daughter', 'child'),
    ('moments', ''),
]
fdbk_cln = []
for response in all_data['feedback_clean']:
    cleaned = response
    for old, new in replacement_pairs:
        cleaned = cleaned.replace(old, new)
    # Finally strip any remaining ASCII punctuation characters
    for ch in punct:
        cleaned = cleaned.replace(ch, '')
    fdbk_cln.append(cleaned)
# Add clean data to the all_data df
all_data['feedback_clean'] = fdbk_cln
all_data[::910]
# Remove the top 10% most common words
# Bug fixes relative to the original:
#   * documents are joined with spaces — the original concatenated strings
#     directly, gluing the last word of one response to the first word of
#     the next (a discarded `' '.join(fdbk_cln)` hinted at the intent);
#   * `most_common` now holds the words themselves — the original kept the
#     (word, count) tuples from FreqDist.most_common, so the membership
#     test below never matched and only 've'/'ll' were actually removed.
new_str = ' '.join(fdbk_cln)
all_words = nltk.tokenize.word_tokenize(new_str)
all_word_dist = nltk.FreqDist(w.lower() for w in all_words)
cutoff = int(len(all_word_dist)*0.1)
most_common = [word for word, count in all_word_dist.most_common(cutoff)]
most_common.extend(['ve', 'll'])
# Use a set for O(1) membership tests inside the filter below
most_common = set(most_common)
all_data['feedback_trimmed'] = all_data['feedback_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in most_common]))
# Tokenize text
all_data['feedback_trimmed'] = all_data['feedback_trimmed'].apply(lambda x: x.split())
all_data[::910]
# Lists to store our ngram tokens
bigrams = []
trigrams = []
# Loop through the dataset and get bigrams and trigrams for each open-text response
for i in range(len(all_data)):
    text = all_data['feedback_trimmed'][i] # Isolate the response we're interested in
    # Get bigrams if there are enough words available
    # NOTE(review): nltk.util.ngrams returns a lazy generator and does not
    # raise for short inputs, so these try/excepts likely never fire.
    try:
        bg = [' '.join(e) for e in ngrams(text, 2)]
    except:
        bg = []
    bigrams.append(bg)
    # Get trigrams if there are enough words available
    try:
        tg = [' '.join(e) for e in ngrams(text, 3)]
    except:
        tg = []
    trigrams.append(tg)
# Add lists to our working dataframe
all_data['feedback_bigram'] = bigrams
all_data['feedback_trigram'] = trigrams
# Save it
all_data.to_csv('../data/04_feature/cleaned_text_with_features.csv')
# View it
all_data[::910]
# Create dataframe with a trimmed down version of the cols we need
df = pd.DataFrame({'feedback':all_data['feedback_trimmed'],
                   'bigrams':all_data['feedback_bigram'],
                   'trigrams':all_data['feedback_trigram'],
                   'sentiment':all_data['Overall Sentiment Score']})
# Drop rows whose token list came out empty after cleaning
df = df[df['feedback'].map(lambda d: len(d)) > 0]
df = df.reset_index(drop=True)
# Make sure the cols are numeric
df['sentiment'] = df['sentiment'].astype(float)
df[::875]
# Cleaning out some sticklers
remove_list = ['nov 06 2019', 'south suburbs needs', 'side chicago south', 'south side chicago',
               'suburbs needs open', 'center south side', 'need center south', 'something south partner',
               'south partner existing', 'chicago south suburbs']
# Filter the unwanted trigrams out of every response in one pass
df['trigram_clean'] = [
    [gram for gram in response if gram not in remove_list]
    for response in df['trigrams']
]
df[::875]
# Create positive and negative dfs (compound score >= 0.50 / <= -0.50)
pos_df = df[df['sentiment']>=0.50]
pos_df = pos_df.reset_index(drop=True)
neg_df = df[df['sentiment']<=-0.50]
neg_df = neg_df.reset_index(drop=True)
# NOTE(review): ngram_analysis accepts num_n but never uses it; the n-gram
# size is fixed by which pre-computed column is passed.
# All Unigrams
ngram_analysis(df=df, ngram_col='feedback', num_n=1, title='Bag-of-Words: Overall')
# Positive
ngram_analysis(df=pos_df, ngram_col='feedback', num_n=1, title='Bag-of-Words: Positive')
# Negative
ngram_analysis(df=neg_df, ngram_col='feedback', num_n=1, title='Bag-of-Words: Negative')
# All Bigrams
ngram_analysis(df=df, ngram_col='bigrams', num_n=1, title='Bigrams: Overall', figsize=(22, 15))
# Positive
ngram_analysis(df=pos_df, ngram_col='bigrams', num_n=1, title='Bigrams: Positive', figsize=(22, 15))
# Negative
ngram_analysis(df=neg_df, ngram_col='bigrams', num_n=1, title='Bigrams: Negative', figsize=(22, 15))
# All Trigrams
ngram_analysis(df=df, ngram_col='trigram_clean', num_n=1, title='Trigrams: Overall', figsize=(26, 15))
# Positive
ngram_analysis(df=pos_df, ngram_col='trigram_clean', num_n=1, title='Trigrams: Positive', figsize=(26, 15))
# Negative
ngram_analysis(df=neg_df, ngram_col='trigram_clean', num_n=1, title='Trigrams: Negative', figsize=(26, 15))
# NOTE(review): the three grids below all save their line plots under
# 'complexity_all'/'coherence_all', so each run overwrites the previous
# PNGs; the same applies to the 'pos20' names further down.
# All responses: LDA Grid
all_lda = lda_model_grid(df=df, col='feedback')
line_plot(df=all_lda, col='complexity', save_name='complexity_all')
line_plot(df=all_lda, col='coherence', save_name='coherence_all')
# All responses: Single Model
all_topics = lda_one_topic(df=df, col='feedback', topics=6,
                           savename='all_responses', grid_cols=2)
# All responses: Bigrams LDA Grid
all_lda = lda_model_grid(df=df, col='bigrams')
line_plot(df=all_lda, col='complexity', save_name='complexity_all')
line_plot(df=all_lda, col='coherence', save_name='coherence_all')
# All responses: Bigrams single model
all_topics = lda_one_topic(df=df, col='bigrams', topics=6,
                           savename='all_bigrams', grid_cols=2)
# All responses: Trigrams LDA Grid
all_lda = lda_model_grid(df=df, col='trigrams')
line_plot(df=all_lda, col='complexity', save_name='complexity_all')
line_plot(df=all_lda, col='coherence', save_name='coherence_all')
# All responses: Trigrams single model
all_topics = lda_one_topic(df=df, col='trigrams', topics=4,
                           savename='all_trigrams', grid_cols=2)
# Positive responses: LDA Grid
pos_20_lda = lda_model_grid(df=pos_df, col='feedback')
line_plot(df=pos_20_lda, col='complexity', save_name='complexity_pos20')
line_plot(df=pos_20_lda, col='coherence', save_name='coherence_pos_20')
# Positive responses: Single model
p20_topics = lda_one_topic(df=pos_df, col='feedback', topics=5,
                           savename='positive_all', grid_cols=2, fig_height=7.5)
# Positive responses: Bigrams LDA Grid
pos_20_lda = lda_model_grid(df=pos_df, col='bigrams')
line_plot(df=pos_20_lda, col='complexity', save_name='complexity_pos20')
line_plot(df=pos_20_lda, col='coherence', save_name='coherence_pos_20')
# Positive responses: Bigrams single model
p20_topics = lda_one_topic(df=pos_df, col='bigrams', topics=7,
                           savename='positive_bigrams', grid_cols=2, fig_height=6)
# Positive responses: Trigrams LDA Grid
pos_20_lda = lda_model_grid(df=pos_df, col='trigrams')
line_plot(df=pos_20_lda, col='complexity', save_name='complexity_pos20')
line_plot(df=pos_20_lda, col='coherence', save_name='coherence_pos_20')
# Positive responses: Trigrams single model
p20_topics = lda_one_topic(df=pos_df, col='trigrams', topics=2,
                           savename='positive_trigrams', grid_cols=2)
# Negative responses: LDA Grid
neg_20_lda = lda_model_grid(df=neg_df, col='feedback')
line_plot(df=neg_20_lda, col='complexity', save_name='complexity_neg_20')
line_plot(df=neg_20_lda, col='coherence', save_name='coherence_neg_20')
# Negative responses: Single model
# Bug fix: the original passed savename='negative_bigrams' here, colliding
# with the bigram model below, whose outputs overwrote this model's saved
# .gensim file, topic-word CSV, and pyLDAvis HTML.
p21_topics = lda_one_topic(df=neg_df, col='feedback', topics=4,
                           savename='negative_all', grid_cols=2)
# NOTE(review): both grids below save their line plots under
# 'complexity_neg_20'/'coherence_neg_20', so the trigram run overwrites
# the bigram run's PNGs.
# Negative responses: Bigrams LDA Grid
neg_20_lda = lda_model_grid(df=neg_df, col='bigrams')
line_plot(df=neg_20_lda, col='complexity', save_name='complexity_neg_20')
line_plot(df=neg_20_lda, col='coherence', save_name='coherence_neg_20')
# Negative responses: Bigrams single model
p21_topics = lda_one_topic(df=neg_df, col='bigrams', topics=4,
                           savename='negative_bigrams', grid_cols=2, fig_height=6)
# Negative responses: Trigrams LDA Grid
neg_20_lda = lda_model_grid(df=neg_df, col='trigrams')
line_plot(df=neg_20_lda, col='complexity', save_name='complexity_neg_20')
line_plot(df=neg_20_lda, col='coherence', save_name='coherence_neg_20')
# Negative responses: Trigrams single model
p21_topics = lda_one_topic(df=neg_df, col='trigrams', topics=4,
                           savename='negative_trigrams', grid_cols=2)